# coding=utf-8
'''
Created on 2012-8-26

@author: quanwei
'''
import urllib2
from core import Worker
import threading
import time

class HttpWorker(Worker.Worker):
    '''
    Worker that downloads the page behind a task's URL and queues the
    follow-up tasks produced by that task's analyzer.

    ``lock``, ``counter`` and ``maxCounter`` are class attributes, so the
    request counter and its throttle are shared by all HttpWorker instances.
    '''
    # Guards updates to ``counter`` (and the throttle pause) across instances.
    lock = threading.Lock()
    # Shared request counter; starts at 1 and is incremented before each fetch.
    counter = 1
    # A one-second pause is taken whenever ``counter`` is a multiple of this.
    maxCounter = 10

    def execute(self, task):
        '''
        Process one task: fetch its page, parse it with the task's analyzer
        and hand every task the analyzer returns to the dispatcher.

        task -- object providing ``getAnalyzer()`` and a ``url`` attribute.
        '''
        # Run the base-class step first; its behaviour is defined outside this file.
        Worker.Worker.execute(self, task)
        analyzer = task.getAnalyzer()
        html = self.getHtml(task)
        newTasks = analyzer.parseHtml(html, task)

        # NOTE(review): ``self.dispatcher`` is not set in this class; presumably
        # the base Worker provides it -- confirm.
        for newTask in newTasks:
            self.dispatcher.addTask(newTask)

    # Fetch the web page
    def getHtml(self, task):
        '''
        Download ``task.url`` with a browser-like User-Agent header and
        return the response body as read from the connection.

        Exceptions raised by ``urllib2`` while opening or reading the URL
        propagate to the caller; the response is closed in either case.
        '''
        # ``with`` guarantees the shared lock is released even if the body
        # raises (e.g. maxCounter overridden to 0), so other workers cannot
        # be left blocked forever.
        with HttpWorker.lock:
            HttpWorker.counter = HttpWorker.counter + 1
            # Avoid getting banned: pause periodically. The sleep happens
            # while the lock is held, so other workers arriving here wait too.
            if HttpWorker.counter % self.maxCounter == 0:
                print ("counter=%s,sleep" %(HttpWorker.counter))
                time.sleep(1)
        print(task.url)
        req = urllib2.Request(task.url)
        req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.8 Safari/537.4")
        res = urllib2.urlopen( req )
        try:
            # Close the response even when reading fails part-way through.
            return res.read()
        finally:
            res.close()
         
        